#!/usr/bin/python
# -*- encoding: utf-8 -*-

# Score the lines of text files against a language model FST and report
# how many lines/characters match, plus the accumulated cost.

import sys,os,re,glob,math,glob,signal,traceback
if "DISPLAY" not in os.environ:
    import matplotlib
    matplotlib.use("AGG")
from scipy.ndimage import interpolation
from pylab import *
from optparse import OptionParser
from multiprocessing import Pool
import ocrolib
from ocrolib import number_of_processors,die
from scipy import stats
import ocrofst
from ocrofst import fstutils

# Leave with exit status 1 when the user interrupts the run (Ctrl-C).
def _exit_on_sigint(*_signal_args):
    sys.exit(1)

signal.signal(signal.SIGINT, _exit_on_sigint)

# Command-line interface: display switches, the language model path, the
# beam width, and the text files to process as positional arguments.
parser = OptionParser(usage="\nusage: \n%prog [-l langmod] file.txt ...\n\n")

parser.add_option("-g", "--grep", action="store_true",
                  help="show matching lines")
parser.add_option("-n", "--nonmatching", action="store_true",
                  help="show nonmatching lines")
parser.add_option("-l", "--langmod", default=None,
                  help="language model")
parser.add_option("-b", "--beam", type=int, default=1000,
                  help="beam width")

# options: parsed flags; args: the remaining positional arguments (files).
options, args = parser.parse_args()

# Running totals accumulated over every input file:
#   count     - number of lines processed
#   chars     - number of characters in those lines (after normalization)
#   fail      - number of lines counted as failures
#   cfail     - number of characters in the failed lines
#   totalcost - summed cost of the lines that were not failures
count = chars = fail = cfail = totalcost = 0

# The language model is mandatory: --langmod defaults to None, and there is
# nothing to score against without it.  Stop with a usage error instead of
# handing None to the loader.
if options.langmod is None:
    parser.error("no language model given (use -l/--langmod)")

# Load the language model FST that every input line is searched against.
lmodel = ocrofst.OcroFST()
lmodel.load(options.langmod)

# Patterns used to normalize each line before scoring; compiled once here
# rather than being looked up again for every line.
_whitespace_run = re.compile(u"\\s+")
_ignored_chars = re.compile(u"[_~]")

# Lines whose total cost exceeds this value are counted as failures.
# NOTE(review): presumably the beam search reports "no path found" as a
# huge cost -- confirm against ocrofst.
_NO_MATCH_COST = 1e38

# Score every line of every input file against the language model and
# update the running totals (count, chars, fail, cfail, totalcost).
for fname in args:
    with open(fname) as stream:
        for raw in stream:
            # strip() already removes the trailing newline.  The former
            # unconditional line[:-1] also chopped the last byte off a final
            # line that had no newline, which could even split a multi-byte
            # UTF-8 sequence and make the decode fail.
            line = raw.decode("utf-8").strip()
            # Collapse whitespace runs to a single space, then drop the
            # '_' and '~' characters.
            line = _whitespace_run.sub(u" ", line)
            line = _ignored_chars.sub(u"", line)
            count += 1
            chars += len(line)

            # Build an FST for the line and search it against the model;
            # only the returned costs are used here.
            lattice = fstutils.simple_line_fst(line)
            v1, v2, ins, outs, costs = ocrofst.beam_search(
                lattice, lmodel, options.beam)
            cost = sum(costs)

            if cost > _NO_MATCH_COST:
                fail += 1
                cfail += len(line)
                show = options.nonmatching
            else:
                totalcost += cost
                show = options.grep

            if show:
                print("%8.2f %s" % (cost, line.encode("utf-8")))

# Final summary of the run.  Ratios are reported as 0 when no lines (or no
# characters) were read, e.g. when no input files were given or they were
# empty; the unguarded divisions used to raise ZeroDivisionError there.
def _ratio(numerator, denominator):
    """Return numerator / denominator as a float, or 0.0 if denominator is 0."""
    if not denominator:
        return 0.0
    return float(numerator) / denominator

print("count %s" % count)
print("chars %s" % chars)
print("fail   %8d %8.2f%%" % (fail, _ratio(fail * 100.0, count)))
print("cfail  %8d %8.2f%%" % (cfail, _ratio(cfail * 100.0, chars)))
# NOTE(review): this mean divides by all lines, although failed lines add
# nothing to totalcost -- confirm that is the intended denominator.
print("cost   %8.1f %8.2f" % (totalcost, _ratio(totalcost, count)))
